/******************************************************************************************************
*	data_prep.do	
* 	This scripts cleans, labels and prepares variables and saves the analysis data sets to be used in analysis.do
*	Author: Andreas Videbæk Jensen
*******************************************************************************************************/

***************************************************************
* Contents:                                                   *
* - Section 1: ELS:2002 data                                  *
* - Section 2: NELS:88 data                                   *
***************************************************************
* Note: The script uses the command "missings", which may need to be installed from SSC first: "ssc install missings"

/* OBS. Before running, change Stata's working directory to the main folder of this replication package called "replication_files".
   The path below is a placeholder and has to be edited by hand. The data files named in the use, merge and save
   commands further down are given without a directory, so they are resolved relative to this folder. */
 cd "(...)INSERT PATH TO FOLDER HERE (...)/replication_files"

****************************************************************************
**** Section 1: ELS:2002                     							****	
****************************************************************************	
    *** The ELS data is initially a wide dataset with one row per panelist. 
    *** This script finally exports the data to a long format with one row per panelist per year (2004 and 2008)
    
    * Load only the ELS:2002 variables that are used in this script.
    * The variable list is a single long -use- command, so the delimiter is switched to a
    * semi-colon for the duration of the command and switched back to carriage return afterwards.
    * Each variable is listed exactly once.
    #delimit;
    clear;
    /* Import selected variables from raw ELS data */
    use 
        BYS71E
        BYTXCSTD
        BYTXMQU
        BYTXRQU
        F2D15A
        F1SES2QU
        F2D15B
        F2D02P2
        F1RTRFLG
        F2D15C
        F2D15D
        F2D15E
        F2D15F
        BYSES1QU
        F2D15G
        BYS71F
        BYS41C
        BYS41E
        BYS41F
        BYS41G
        F2D05P2
        F1DOB_P
        BYS41H
        BYS41I
        F2EVRJOB
        F2C09
        F2D09
        F1FCOMP
        BYSEX
        F1SEX
        F1RACE
        F1SES1
        F1SES1QU
        F1SES2
        BYTXCQU
        F1WRKHRS
        F3TZTRANRESP
        F3TZANYDEGRE
        F3TZHIGHDEG
        F3TZCOVERAGE
        F3TZMONENROL
        F3TZSTOPGT4M
        F3TZTOTLSTOP
        F3TZEVER4YR
        F3TZEVER2YR
        F3MOBILITYF1F2
        STU_ID
        STRAT_ID
        BYS86H
        F1S64I
        PSU
        F1SCH_ID
        F1UNIV1
        F1UNIV2A
        F1UNIV2B
        F2UNIV_P
        F3UNIV
        F3UNIVG10
        F3UNIVG12
        G10COHRT
        G12COHRT
        F3PS1RETAIN
        F3JUNEDSTAT
        F3EVR4YR
        F3EVR2YR
        BYSES1
        //Voting:
        F2D13	//Voted in 2004 Presidential election
        F3D38	//Whether voted in the 2008 presidential election
        
        //civic particip
        BYS44C
        F1S39C
        F1S62
        F1D68
        F3D40
        F2D12
        F3D39
        F2HHPAR
        F3EVRLT2
        F3HSSTAT
        F3PS1START
        F3EDSTAT
        F3PS2BA
        
        //Military
        F3B04
        F3B01 
        F3B05
        
        F2F1HSST //High school status by summer 2004
        F2EVRATT 
        F3HSCPDR // High school completion year
        F3TZBACH1DT //Transcript: Date of first known bachelor's degree earned
        F3TZASOC1DT // ---||--- associate degree
        F3TZPS1START // Start 
        F3TZLSTYRENR // Transcript: Last year of known enrollment
        F3ATTAINMENT
        F3ATTAINDATE
        F3EVRATT
        F3A01D
        F2EDLEVL
        
        F1PSEPLN 
        
        F2PSPR_4
        F2PSEJ06

        //Survey weights
        F2QWT
        F3QWT

        //Monthly postsecondary enrollment status, January 2004 to January 2006
        F2PS0401 //Enrolled in postsecondary institution in - 2004/01 (January 2004)
        F2PS0402
        F2PS0403
        F2PS0404
        F2PS0405
        F2PS0406
        F2PS0407
        F2PS0408
        F2PS0409
        F2PS0410
        F2PS0411
        F2PS0412
        F2PS0501
        F2PS0502
        F2PS0503
        F2PS0504
        F2PS0505
        F2PS0506
        F2PS0507
        F2PS0508
        F2PS0509
        F2PS0510
        F2PS0511
        F2PS0512
        F2PS0601
    using "els_02_12_byf3pststu_v1_0.dta", clear;
    #delimit cr 

*** 1.1 Recoding voting variable and other basic variables
    ** 1a) Turnout indicators: voted04 is built from F2D13 and voted08 from F3D38.
    *      Codes -9 -8 -7 -4 are set to missing; 0 and 1 are kept and labelled No / Yes.
        local turnout_src04 F2D13
        local turnout_src08 F3D38
        foreach yy in 04 08 {
            recode `turnout_src`yy'' (-9 -8 -7 -4 = .) (0 = 0 "No") (1 = 1 "Yes"), generate(voted`yy')
        }

    ** 1b) High school completion year (F3HSCPDR): the two reserved codes are turned into
    *      labelled extended missing values in the new variable hscompl_year.
        recode F3HSCPDR (-9 = .a "Missing") (-3 = .b "Item legitimate skip/NA"), generate(hscompl_year)


*** 1.2 CollegeGoing: Measuring college enrollment between 2004 and 2008 
    * The dataset gives us access to two sources of this information (to triangulate out missing values)
    * (1) Survey responses from F2 survey in 2006, (2) survey responses as well as official Post-secondary (ps) transcript data from Wave 3 in 2012

    * This allows us to construct the following three indicators (a, b and c)
        *	(a) at least 1 month of PS enrollment between november 2004 and january 2006. (f2 survey method)
        *   (b) attenders of college in any of the years 2005, 2006, 2007 and 2008 (f3 transcript + survey method 1)
                * - F3TZPS1START F3TZLSTYRENR F3PS1START
        *	(c) Those who attained a BA degree before t2 (F3 transcript + survey method 2) 

    * These variables combine into a measure of whether or not the student attended college between elections in 2004 and 2008.

    *** 1.2.1 First enrollment indicator (a) (F2 survey method)
    /*  Should measure whether panelist had at least 1 month of PS enrollment between november 2004 and january 2006.
            - We use the variables F2EVRATT / F2EDLEVL that ask in F2 2006 whether the student ever attended post-secondary education.
            - For respondents who answered yes, we have one enrollment variable per month (F2PS0411 to F2PS0601).
        Coding of evr_enrolled_between_0411_0601 while it is being built:
            0 = No, 1 = yes, 2 = maybe (ever attended, but timing not yet established), . = unknown */

        * Respondents who never attended college as of the second followup in 2006 did not have at least one month of enrollment.
        * NOTE(review): the rule (439 = 1 "yes") presumably only serves to define the value label for code 1,
        *               i.e. 439 is assumed not to be an actual code of F2EVRATT - confirm against the codebook.
        recode F2EDLEVL (1 2 3 4 = 0 "No") (5 6 = 2 "Maybe") (-4 -8 -9 = .) ,gen(evr_enrolled_between_alt)
        recode F2EVRATT (0 = 0 "No") (1 = 2 "maybe") (439 = 1 "yes") (-9 -8 -4 -3  = .) , gen(evr_enrolled_between_0411_0601)

        * Fill gaps in the F2EVRATT-based variable from the F2EDLEVL-based helper.
        * The helper only takes the values 0 and 2 (or missing), so only the "No" case can be filled in.
        recode evr_enrolled_between_0411_0601 (. = 0) if evr_enrolled_between_alt == 0
        drop evr_enrolled_between_alt

        * Binary monthly enrollment variables for nov 2004 - jan 2006.
        * (Obs: respondents with no enrollment at all by F2 are missing on the monthly variables.)
        * The month suffixes are listed once; the list of generated variables is collected in enrol_bins
        * so that the row total and the missing count below are guaranteed to use the same variables.
        local enrol_months 0411 0412 0501 0502 0503 0504 0505 0506 0507 0508 0509 0510 0511 0512 0601
        local n_enrol_months : word count `enrol_months'
        local enrol_bins
        foreach ym of local enrol_months {
            recode F2PS`ym'	(2 4 = 1 "enrolled in higher ed") ///
                            (3 = 0 "not enrolled in higher ed") /// less than 2-year institution is not considered higher ed
                            (-3 -4 -8 -9 = .), gen(F2PS`ym'_bin)
            local enrol_bins `enrol_bins' F2PS`ym'_bin
        }

        * Number of months enrolled (rowtotal treats missing months as zero) and number of missing months
        egen ps_months0411_0601 = rowtotal(`enrol_bins')
        missings tag `enrol_bins', generate(no_of_miss_months)
        generate one_month = (ps_months0411_0601 != 0) //enrolled at least one month

        * Coding students that were enrolled or not enrolled in the period:
        * the monthly information is used when it is complete, or when at least one enrolled month is observed.
        replace evr_enrolled_between_0411_0601 = one_month if no_of_miss_months == 0 | one_month == 1

        * Remaining "maybe" students who are missing on every monthly variable have to be coded missing
        recode evr_enrolled_between_0411_0601 (2=.) if no_of_miss_months == `n_enrol_months'
        * First indicator(a) is done. 

    *** 1.2.2 Second enrollment indicator (b) (F3 method 1: transcript + survey) /*
	    - started college in one of the years 2005, 2006, 2007 and 2008 according to official transcripts or self report. Or ended PS enrollment between 2005 and 2008.
		- variables: F3TZPS1START F3TZLSTYRENR
		- This block builds the survey part; evr_enrolled_between_2005_2008 is coded
		  0 = No, 1 = Yes, 2 = Maybe (ever attended, timing unknown), . = unknown.
		  The statements below overwrite each other, so their order matters.	 */

        * Respondents who never attended college as of the third follow up 2012, we can be sure, did not attend college in any of the years 2005-2008:
        * The variables F3EDSTAT and F3EVRATT apply to all F3 respondents.
        * NOTE(review): the rule (235 = 1 "Yes") presumably only defines the value label for code 1
        *               (235 is assumed not to be a real F3EDSTAT code) - confirm against the codebook.
        recode F3EVRATT (0= 0 "No") (1 = 2 "Maybe (i.e. ever attended)") (-4 -8 = .) ,gen(evr_enrolled_between_2005_20_alt)
        recode F3EDSTAT(6 = 0 "No") (1 2 3 4 5 = 2 "Maybe (i.e. ever attended)") (-4 -8 = .) (235 = 1 "Yes") ,gen(evr_enrolled_between_2005_2008)

        * The F3EVRATT-based helper was only created for the (commented-out) comparison below and is dropped again.
        //tab evr_enrolled_between_200* , mis //The variables are identical. 
        drop evr_enrolled_between_2005_20_alt

        * The variable F3PS1START was asked to all students with F3EVRATT == 1, and missing otherwise. I.e. it was asked to all who are currently coded "maybe" on indicator 2.
        * Reserved codes are set to missing in place (F3PS1START itself is modified).
        recode F3PS1START (-3 -4 -8 -9 = .)

        * Coding students that started college in this period (start year 2005-2008 AND ever attended a 4-year or 2-year institution)
        replace evr_enrolled_between_2005_2008 = 1 if inlist(F3PS1START, 2005, 2006, 2007, 2008) & (inlist(F3EVR4YR, 1) | inlist(F3EVR2YR, 1))
        * OBS People who started in 2009 or later, had no enrollment
        replace evr_enrolled_between_2005_2008 = 0 if inlist(F3PS1START, 2009, 2010, 2011, 2012)

        * OBS: never attending any institution is considered no higher education
        * I.e. Extending population of variable to those who never attended college
        * NOTE(review): this assumes F3TZTRANRESP == 0 identifies respondents without any post-secondary attendance - confirm against the codebook.
        replace evr_enrolled_between_2005_2008 = 0 if F3TZTRANRESP == 0
        
        * This completes the part of the second indicator (b) based on the survey data. To do the transcript part, we use a variable based on data on number of years in college.
        * This completes the part of the second indicator (b) based on the survey data. To do the transcript part, we use a variable based on data on number of years in college.

        *** 1.2.2.1 intermediate variable "years_attended": Measuring time spent in college.
            * The years_attended-variable measures a count of how many of the years between the 2004 and 2008 elections the respondent attended college
            * The transcript data from post-secondary enrollments allow us to measure this more fine grained education variable.
            *	- 	PETS were requested for each of the ELS sample members who reported in the student interview that they had attended an institution that is 
            *		part of the Integrated transcript system (IPEDS) (i.e. 12,549 resps) (addendum page 2)
            
            /* Method: Differencing these variables:
                F3TZPS1START		Transcript: First attended ever month/year
                F3TZLSTYRENR		Transcript: Last year of known enrollment */

            * All four reserved codes (-9 -8 -4 -3) are set to missing for both variables, the same set that section 1.9
            * uses for these two variables. A reserved code that is left in place would enter the difference below
            * as if it were a year (for example -3 minus -3 plus 1 gives 1 "year attended").
            recode F3TZPS1START (-9 -8 -4 -3 = .) , gen(start_year)
            recode F3TZLSTYRENR (-9 -8 -4 -3 = .) , gen(end_year)

            * Clamp both years to the 2005-2008 window. Start years of 2009 or later are all set to 2009 so that,
            * together with an end year capped at 2008, the difference below becomes 0.
            recode start_year (2001 2002 2003 2004 2005 = 2005) (2009 2010 2011 2012 = 2009), gen(start_year_within_period) //Starters before 2005 have 2005 as their first countable year. 
            recode end_year (2004 2005 = 2005) (2008 2009 2010 2011 2012 = 2008) , gen(end_year_within_period) // 2008 is the last countable year

            *Calculating difference (missing if either year is missing):
            gen years_attended = end_year_within_period - start_year_within_period + 1
            * Diagnostic cross-tabulation only. Note that bysort re-sorts the data by years_attended.
            bysort years_attended: tab start_year end_year, mis
            * OBS1: never attending a 2 or more years institution is considered no higher education
            replace years_attended = 0 if (inlist(F3TZEVER4YR, 0) & inlist(F3TZEVER2YR, 0)) // Coding less than 2 year as non-college
            * OBS2: never attending any institution is considered no higher education. (use ever-attended variables:)
            * F3EVRATT is recoded in place here; later sections read the recoded variable.
            recode F3EVRATT (-4 -8 = .)
            replace years_attended = 0 if F3EVRATT == 0
            * and use transcript respondent status. F3TZTRANRESP
            replace years_attended = 0 if F3TZTRANRESP == 0
            * Intermediate variable is done. 

        * Transcript part of second indicator (b), continued.
        * Zero countable years according to the transcripts: a missing or "maybe" value becomes a definite "No".
        replace evr_enrolled_between_2005_2008 = 0 if years_attended == 0 & inlist(evr_enrolled_between_2005_2008, ., 2)

        * One to four countable years: definite "Yes".
        replace evr_enrolled_between_2005_2008 = 1 if inlist(years_attended, 1, 2, 3, 4)

        * Whoever is still "maybe" had some enrollment, but it cannot be determined WHEN it took place.
        * These respondents are set to missing.
        replace evr_enrolled_between_2005_2008 = . if evr_enrolled_between_2005_2008 == 2
        
        * Second indicator (b) is done.

    *** 1.2.3 Combining first and second indicator (a) and (b)
        * CollegeGoing starts as a copy of indicator (b) and is switched to 1 for everyone who is positive
        * on indicator (a), since we know they attended college in the period.
        label define CollegeGoing 0 "No College" 1 "Some College"
        generate CollegeGoing = evr_enrolled_between_2005_2008 
        replace CollegeGoing = 1 if evr_enrolled_between_0411_0601 == 1
        label values CollegeGoing CollegeGoing

    *** 1.2.4 Third enrollment indicator (c) - bachelor achievement (F3 method 2: transcript) /*
        - We want a measure of whether a person had achieved a college bachelor by end of 2008 (=1), and 0 if not. (s_bachelor_between_2005_2008)
        - This can be measured in 2 ways:
            - A F3 Survey method
            - A F3 Transcript method

        * 1.2.4.1 Survey Method BA achievement
            - People who by 2012 had highschool or lower as highest attainment, should have the value of zero. (we are positive that they did not have a bachelor)
            - While it is being built, s_bachelor_between_2005_2008 is coded 0 = less than bachelor, 1 = received bachelor,
              2 = maybe (bachelor or higher by 2012, timing unknown). The recodes below resolve the 2s step by step,
              so their order matters. */
            * NOTE(review): the rule (4845 = 1 "recieved bachelor") presumably only defines the value label for code 1
            *               (4845 is assumed not to be a real F3ATTAINMENT code) - confirm against the codebook.
            recode F3ATTAINMENT (-8 -4 = .) (1 2 3 4 5 = 0 "Less than bachelor") (6 7 8 10 = 2 "Maybe (ever achieved)") (4845 = 1 "recieved bachelor"), gen(s_bachelor_between_2005_2008)
            * Reserved codes of the source variable itself are also set to missing (in place)
            recode F3ATTAINMENT (-8 -4 = .) 

            **Receiving your highest educational attainment in 2008 or earlier, means that we are positive that you finished your bachelor in 2008 or earlier:
            * (F3ATTAINDATE is recoded in place; only the years 2005-2008 are matched here)
            recode F3ATTAINDATE (-9 -8 -4 -3 = .)
            recode s_bachelor_between_2005_2008 (2 = 1) if inlist(F3ATTAINDATE, 2005, 2006, 2007, 2008)

            ** If highest received is Bachelor AND it was received 2009 or later, we are positive the respondent did not have a bachelor in 2008 or earlier (and should therefore be coded to 0):
            recode s_bachelor_between_2005_2008 (2 = 0) if inlist(F3ATTAINMENT,6) &inlist(F3ATTAINDATE,2009, 2010, 2011, 2012)

            * For those who attained bachelor or higher by 2012, IF we do not know the date of completion (F3ATTAINDATE) we are not sure if they had received bachelor or not by 2008, and therefore code to missing
            recode s_bachelor_between_2005_2008 (2 = .) if missing(F3ATTAINDATE)

            *If they received a master or higher (F3ATTAINMENT 8 or 10) in 2009, it is extremely unlikely that they had not received a bachelor by 2008 (we code them to 1):
            recode s_bachelor_between_2005_2008 (2 = 1) if inlist(F3ATTAINMENT,8, 10) & inlist(F3ATTAINDATE,2009)
         
            ** For the remaining students the survey data is not fine grained enough to securely place them as having either attained or not attained bachelor by 2008. Thus we code them to missing
            recode s_bachelor_between_2005_2008 (2 = .) //9 out of 10 seem to be caught with transcript data as we will see

        * 1.2.4.2  Transcript method BA achievement
        /*  This indicator should measure whether the person had attained a bachelors degree in 2008 or earlier ( = 1 "recived BA by 2008").
            People who attained it later or never attained should be coded (0 "less than BA by 2008")
            The transcript data from post-secondary enrollments allow us to measure this more fine grained education variable.
            	- 	PETS were requested for each of the ELS sample members who reported in the student interview that they had attended an institution that is 
            		part of the Integrated transcript system (IPEDS) (i.e. 12,549 	resps) (addendum page 2) */


            * The following variable was asked to all transcript respondents with a known BA from transcript
            // codebook F3TZBACH1DT, tab(20) //Transcript: Date of first known bachelor's degree earned
            recode F3TZBACH1DT (-9 -8 -4 -3 = .), gen(t_date_of_first_known_BA)

            * All four years of the 2005-2008 window count as "received", matching the survey-based indicator,
            * which uses 2005, 2006, 2007 and 2008. Listing only 2007 and 2008 would leave the raw years 2005 and 2006
            * in this variable, and those respondents would never be picked up by the "== 1" test further down.
            recode t_date_of_first_known_BA (2005 2006 2007 2008 = 1 "Recieved bachelor") (2009 2010 2011 2012 2013 = 0 "Less than bachelor"), gen(t_bachelor_between_2005_2008) 

            label var s_bachelor_between_2005_2008 "survey: recieved BA 2005-2008 (RECODE of F3ATTAINMENT)"
            label var t_bachelor_between_2005_2008 "transcript: recieved BA 2005-2008 (RECODE of F3TZBACH1DT)"

            **** Note that we are confident about those who did never attend post secondary
            replace t_bachelor_between_2005_2008 = 0 if F3EVRATT == 0
            replace t_bachelor_between_2005_2008 = 0 if F3TZTRANRESP == 0

            * If highest attained degree according to transcripts is below BA, we can code to zero.
            * The variable is named explicitly instead of through a name pattern, so that no other
            * variable whose name happens to contain HIGH can be recoded by accident.
            recode F3TZHIGHDEG (-9 -8 -4 -3 = .)
            replace t_bachelor_between_2005_2008 = 0 if inlist(F3TZHIGHDEG,1,2)

            * Combine transcript and survey part of third indicator (c):
            * start from the survey indicator and switch to 1 where the transcript indicator is positive
            * (we know they received BA in the period)
            gen received_BA = s_bachelor_between_2005_2008
            label values received_BA s_bachelor_between_2005_2008
            replace received_BA = 1 if t_bachelor_between_2005_2008 == 1

        * Third indicator (c) is done.

    *** 1.2.5 Combining third indicator (c) with the variable CollegeGoing which combined first and second indicator (a) and (b) 
        * BA recipients are college-goers by definition.
        replace CollegeGoing = 1 if received_BA == 1 
        * College-goers for whom BA status is unknown are set to missing on CollegeGoing.
        recode CollegeGoing (1 = .) if missing(received_BA)

    * Variable (CollegeGoing) measuring college enrollment between 2004 and 2008 is done. 
   
   

*** 1.3 Highschool status by t1 (2004)
        * 1 if the high school completion year is 2005-2009, or if F3HSSTAT is 7, 8 or 9; 0 otherwise
        generate non_hs_grad_2004 = inlist(hscompl_year, 2005, 2006, 2007, 2008, 2009) | inlist(F3HSSTAT, 7, 8, 9)

*** 1.4 Year of birth / age
    * Reserved codes -9 and -8 of F1DOB_P are set to missing in place
    replace F1DOB_P = . if inlist(F1DOB_P, -9, -8)
    * Eligibility flag: F1DOB_P between 198300 and 198610 (inrange is 0 when F1DOB_P is missing)
    generate eligible_voter_2004 = inrange(F1DOB_P, 198300, 198610)
    * Consecutive integer id for each distinct value of F1DOB_P
    egen monthdiffs_birth = group(F1DOB_P)

*** 1.5 Background variables 

    * Turnout in wave 1 (plain copy of voted04)
    generate m_voteW1 = voted04

    * Parental SES quartile; reserved codes -8 and -4 become missing in the copy
    clonevar m_SESW0 = F1SES1QU
    replace m_SESW0 = . if inlist(m_SESW0, -8, -4)

    * Birth year: the first four characters of the string form of F1DOB_P, converted back to a number
    tostring F1DOB_P, generate(m_birth_year)
    replace m_birth_year = substr(m_birth_year, 1, 4)
    destring m_birth_year, replace

    * Sex: 1 = male, 0 = female
    recode F1SEX (1 = 1 "male") (2 = 0 "female"), generate(m_sex)

    * Composite test score quartile; reserved code -8 becomes missing in the copy
    clonevar m_test_score_quarts = BYTXCQU
    replace m_test_score_quarts = . if m_test_score_quarts == -8

    * Race (unmodified copy of F1RACE)
    clonevar m_race = F1RACE 

    * Pre-college cognitive ability: 10th grade math skills (standardized test quartiles).
    * The source variable BYTXMQU is cleaned in place before it is copied.
    replace BYTXMQU = . if BYTXMQU == -8
    clonevar rr_dyn_cog = BYTXMQU

    * Parental education, income and occupation (SES quartiles): copy of the cleaned SES variable
    clonevar rr_dyn_ses = m_SESW0

    * High school language test-scores: 10th grade reading skills (standardized test quartiles).
    * The source variable BYTXRQU is cleaned in place before it is copied.
    replace BYTXRQU = . if BYTXRQU == -8
    clonevar rr_dyn_read = BYTXRQU

*** 1.6 Missingness on independent and dependent variables:
    * missings tag counts, per respondent, how many of the three variables are missing;
    * the recode then collapses every positive count to 1, giving a 0/1 flag.
    missings tag voted04 voted08 CollegeGoing, generate(mis_on_voting_and_cg)
    recode mis_on_voting_and_cg (1 2 3 4 5 = 1)

*** 1.7 calculating propensity scores for going to college and dropping out of the survey
    * m_race is rebuilt as a four-category variable; the original coding is kept as m_race_temp.
    rename m_race m_race_temp
    recode m_race_temp (4 5 = 1 "Hispanic") (1 3 6 = 2 "Other") (2 =3 "Asian") (7 = 4 "White non-hispanic") ,gen(m_race) // Hispanic, White non-hispanic, Asian, Other
    gen misvot04 = missing(voted04)
 
    *going to college
    * pscore is the predicted probability from the logit; pscore_deciles cuts it into 10 quantile groups.
    * NOTE(review): i.rr_dyn_ses is listed twice in both logit models below, while rr_dyn_cog and rr_dyn_read
    *               (created in section 1.5) are not used anywhere in this script. Possibly one of the two
    *               terms was meant to be a test-score variable - confirm the intended specification.
    logit CollegeGoing i.m_race i.m_sex i.rr_dyn_ses i.rr_dyn_ses i.voted04, robust
    predict pscore, pr 
    xtile pscore_deciles = pscore, nquantiles(10)

    *dropping out of the survey
    * Same covariates, with the missingness flag from 1.6 as outcome; the prediction is split at the median.
    logit mis_on_voting_and_cg i.m_race i.m_sex i.rr_dyn_ses i.rr_dyn_ses i.voted04, robust
    predict pscore_dropout, pr 
    xtile pscore_dropout_halfs = pscore_dropout, nquantiles(2)


*** 1.8 Timevarying covariates:
    *  Life events
        * Every negative code (-9 to -1) of the seven life-event items is set to missing, in place.
        recode F2D15A F2D15B F2D15C F2D15D F2D15E F2D15F F2D15G (-9 -8 -7 -6 -5 -4 -3 -2 -1 = .)
        * Marriage: 1 if the recorded year is 2005 or 2006. Note that the reserved codes (-9 -8 -7 -4 -3)
        * are coded 0 together with the years 2002-2004, i.e. they are NOT treated as missing here.
        recode F2D02P2 (-9 -8 -7 -4 -3 2002 2003 2004= 0) (2005 2006 = 1)
        rename F2D02P2  got_married_0506
        /*
            -	G Crime: You were the victim of a violent crime
            -	E Health: You became seriously ill or disabled
            -	XX Job loss: You lost or quit a job
            -	XX Becoming a parent: You became a parent
            - 	A Parental divorce: Your parents or guardians got divorced or separated
            -	B Parental job loss: One of your parents or guardians lost his or her job
            -	F Family health: A family member became seriously ill or disabled
            -	C Parent death: One of your parents or guardians died
            -	D Family/friend death: A close relative or friend died  */

        * A-G: the suffix _0506 is appended to every variable whose name starts with F2D15
        * (F2D15A becomes F2D15A_0506, and so on)
        rename F2D15* =_0506
        
        * Paid work: job_pre is a 0/1 recode of F1WRKHRS; F2EVRJOB is cleaned in place.
        * The two become paidwork4 and paidwork8 before the reshape in section 1.10.
        recode F1WRKHRS (-9 -8 -4 = .) (0 = 0 "Did not work during 03-04 school year") (1 2 3 4 5 6 7 8 9 = 1 "Did work for pay"), gen(job_pre)
        recode F2EVRJOB (-8 -4 -3 = .)

        *Becoming a parent: You became a parent
        * 1 if the recorded year is 2005 or 2006; reserved codes and the years 2001-2004 are coded 0 (not missing).
        recode F2D05P2 (-9 -8 -7 -4 -3 2001 2002 2003 2004= 0 "No") (2005 2006 = 1 "Yes"), gen(became_parent_0506)

*** 1.9 Measuring future college attendance (after 2008) to be used in placebo analysis and restricted comparison group analyses.
    /*
    *** 1.9.1 Generating variable placebo_collegeGoing, measuring college education between 2008 and 2012
        - future attendance = placebo treatment
        - future attendants = restricted comparison group.

        What we need:
        - A variable which is only non-missing for panelists who received no education between 2004 and 2008. (i.e. Within untreated group of main study)
        - A variable which measures whether these panelists attained some college after 2008 or not. 

        We utilize information from the following variables:
            F3EVRATT  
            F3EDSTAT 
            F3PS1START
            F3TZTRANRESP
            F3TZPS1START		Transcript: First attended ever month/year
            F3TZLSTYRENR		Transcript: Last year of known enrollment

        How the code below works: the variable starts at 0 for the main study's control group and is missing for
        everyone else. Parts a, b and c then set it to 1 from "ever attended" / "ever attained" information,
        without looking at timing. Because those replace statements are not restricted to the control group,
        the restriction is applied a second time at the very end. */

        * The variables F3EDSTAT and F3EVRATT apply to all F3 respondents
            recode F3EDSTAT (-8 -4 =.)

        *Generating variable.
            gen placebo_collegeGoing = 0
            replace placebo_collegeGoing = . if CollegeGoing != 0 // restrict variable to respondents from the main study's control group 
            label variable placebo_collegeGoing "College education between 2008 and 2012"
            label values placebo_collegeGoing CollegeGoing

        * We build the indicator from three sources of information: 
            * (a) Survey-based information (F3EDSTAT)  
            * (b) Transcript-based information 
            * (c) Bachelor achievement 

        * part a: Survey-based information (F3EDSTAT)  
            ***** Respondents who were "currently attending" college in 2012 are at least "Some College"
            replace placebo_collegeGoing = 1 if inlist(F3EDSTAT,1,2)
            ***** Respondents who had previous college attendance in 2012 are at least "Some College". 
            replace placebo_collegeGoing = 1 if inlist(F3EDSTAT,5) & (inlist(F3EVR4YR, 1) | inlist(F3EVR2YR, 1))
            ** Respondents who ever attended a 2 year or ever attended a 4 year institution are at least "some college":
            * (this condition contains the previous one, which therefore adds no further cases)
            replace placebo_collegeGoing = 1 if (inlist(F3EVR4YR, 1) | inlist(F3EVR2YR, 1))

        * part b: Transcript-based information 
            * Reserved codes of the two transcript year variables are set to missing in place
            recode F3TZLSTYRENR F3TZPS1START   (-3 -4 -8 -9 = .)
            ** Respondents who ever attended a 2 year or ever attended a 4 year institution are at least "some college":
            replace placebo_collegeGoing = 1 if (inlist(F3TZEVER4YR, 1) | inlist(F3TZEVER2YR, 1))


        * part c: BA Achievement
            * We want a measure of whether a person achieved a college bachelor between 2008 and 2012. This can be measured in 2 ways
                * A F3 Survey method (F3Attainment)
                * A F3 Transcript method (F3TZBACH1DT and F3TZHIGHDEG)

            *** People attained college by 2012 if their highest level of education earned as of F3 is bachelor or above. 
            replace placebo_collegeGoing = 1 if inlist(F3ATTAINMENT, 6,7,8,10)

            recode F3TZBACH1DT (-9 -8 -4 -3 = .)
            *If the first known bachelor degree in the transcript was attained in 2009 or later, they should be coded completed BA on the placebo-treatment variable
            replace placebo_collegeGoing = 1 if inlist(F3TZBACH1DT,2009,2010,2011,2012,2013)

            *If the highest known degree attained as of June 2013 is bachelor or above, they should be coded completed BA on the placebo-treatment variable
            * (the codebook command only prints the value distribution for inspection)
            codebook F3TZHIGHDEG, tab(20)
            replace placebo_collegeGoing = 1 if inlist(F3TZHIGHDEG,3,5,7)

            *restrict placebo-variable to respondents who are in the main study control group
            * (everyone with CollegeGoing equal to 1 or missing is set back to missing)
            replace placebo_collegeGoing = . if CollegeGoing != 0
            
        * The variable placebo_collegeGoing is done.
    
    *** 1.9.2  Further restrictions on timing of future college attendance, based on how close to 2008 the late-attenders attend college. 
        * Start years come from F3PS1START (survey) and F3TZPS1START (transcript).
        * Restricted control group: no college in 2004-2008, but college afterwards.
        generate in_restr_control_group = CollegeGoing == 0 & placebo_collegeGoing == 1  

        * One flag per cutoff year 2009-2012: member of the restricted control group whose survey OR transcript
        * start year lies between 2009 and the cutoff. For the last cutoff (2012) a missing start year is accepted as well.
        local start_years 2009
        forvalues cutoff = 2009/2012 {
            if `cutoff' > 2009 {
                local start_years `start_years', `cutoff'
            }
            if `cutoff' == 2012 {
                local start_years `start_years', .
            }
            generate future_was_atmost_`cutoff' = in_restr_control_group & (inlist(F3PS1START, `start_years') | inlist(F3TZPS1START, `start_years'))
        }


    *** 1.9.3 Restricted treatment variable comparing only college-goers to future college-goers:
        * Copy of CollegeGoing in which control-group members without any later college are set to missing
        clonevar restr_treatment_level_at_t2 = CollegeGoing
        replace restr_treatment_level_at_t2 = . if restr_treatment_level_at_t2 == 0 & placebo_collegeGoing == 0


*** 1.10 Preparing variable names for reshaping from wide to long format, before exporting analysis data
    * The two time-varying measures get a common stub (voted, paidwork) followed by the wave suffix 4 or 8
    sort STU_ID
    rename (voted04 voted08) (voted4 voted8)
    rename (job_pre F2EVRJOB) (paidwork4 paidwork8)

*** 1.11 Reshaping data to long format
    * After this step there is one row per panelist and wave; the wave suffix is stored in the variable year (4 or 8)
    reshape long voted paidwork, i(STU_ID) j(year)

    label variable paidwork "Whether respondent had a paid job"
    label variable voted "Whether respondent voted"

*** 1.12 Recoding time-varying variables:
    * PostPeriod: 0 for the rows relating to the 2004 election, 1 for the rows relating to the 2008 election
    label define PostPeriod 0 "2004" 1 "2008"
    recode year (4 = 0) (8 = 1), generate(PostPeriod)
    label values PostPeriod PostPeriod
    label variable PostPeriod "1 if 2008"
    drop year

    * General pattern in this block: a characteristic measured once (around wave 2) is multiplied by PostPeriod,
    * so that it is 0 in the 2004 rows and takes its value in the 2008 rows.
    * All variables are referred to by their full names, so the block does not depend on
    * Stata's variable-name abbreviation being switched on.

    * Residential mobility:
        *codebook F3MOBILITYF1F2, tab(5000) //Distance (in miles) between the respondent's first follow-up residence and their second follow-up residence
        recode F3MOBILITYF1F2 (-9 = .), gen(res_mob_at_t2)
        gen res_mob = res_mob_at_t2 * PostPeriod
        egen res_mob_cat = group(res_mob)
        egen res_mob_at_t2_cat = group(res_mob_at_t2)
        gen byte no_move_at_t2 = (res_mob_at_t2 == 0) if res_mob_at_t2 < .
        * log(0) is missing; non-movers are therefore assigned the smallest observed log distance
        gen log_res_mob_at_t2 = log(res_mob_at_t2)
        sum log_res_mob_at_t2, meanonly
        display r(min)
        replace log_res_mob_at_t2 = r(min) if no_move_at_t2 == 1
        //Reintroducing temporal variation:
        gen xs_log_res_mob = log_res_mob_at_t2 * PostPeriod
        gen xx_no_move = no_move_at_t2 * PostPeriod
    
    * Bad life events (the F2D15 items carry the suffix _0506 since section 1.8):
        gen bb_par_died = F2D15C_0506 * PostPeriod
        gen bb_accident = F2D15E_0506 * PostPeriod
        gen bb_par_div = F2D15A_0506 * PostPeriod
    
    * Military service
        // Military service (Relevant variables:  F3B01 F3B04 F3B05)
        gen served_military = (F3B01 == 1)
        gen pretreatment_military_service = inlist(F3B04,2003,2004)
        gen military_service_at_t2 = inlist(F3B04,2005,2006,2007,2008)
        gen timevarying_military_service = military_service_at_t2 * PostPeriod   

    * Living with parents
        * whether lives with parent/guardian 2002/2004:
        recode F1FCOMP (1 2 3 4 5 6 7 8 = 1 "Yes") (9 = 0 "No"), gen(lives_with_par04)
        label variable lives_with_par04 "Whether sample member lived with parents in 2004"
        * whether lives with parent/guardian 2006:
        recode F2HHPAR (-9 -8 -4 = .)
        rename F2HHPAR  lives_with_par06
        * Respondents who lived without parents in both 2004 and 2006 did not move away in between.
        * They are set to 1 here so that the reversed variable below codes them 0.
        clonevar lives_with_par06_adjusted = lives_with_par06
        replace lives_with_par06_adjusted = 1 if lives_with_par04==0 & lives_with_par06==0
        recode lives_with_par06_adjusted (0 =1) (1=0), gen(moved_away_from_parents06)
        gen lives_with_par_mod = moved_away_from_parents06 * PostPeriod
        label define livewithparmod 1 "Stopped living with parents"
        label values lives_with_par_mod livewithparmod

    * Got married
        gen xx_gotmarried = got_married_0506 * PostPeriod

    * Additional Timevarying covariates (life events)
        //Removing suffix: every variable name ending in 0506 loses these four characters (F2D15A_0506 becomes F2D15A_).
        //The monthly enrollment variable F2PS0506 also matches the pattern and is given its name back right away.
        rename *0506 *
        rename F2PS F2PS0506

        *Creating Timevarying covariates (overwritten in place: 0 in the 2004 rows, original value in the 2008 rows)
        foreach eventvar of varlist F2D15A_ F2D15B_ F2D15C_ F2D15D_ F2D15E_ F2D15F_ F2D15G_ became_parent_ {
            replace `eventvar' = `eventvar' * PostPeriod
        }

*** 1.13 Time-invariant variables:
    * Voting in 2004 (same value in both rows of a panelist)
        recode F2D13 (-9 -8 -7 -4 = .) (1 = 1 "Voted in 2004") (0 = 0 "Did not vote in 2004"), gen(voted_in_PREPERIOD)

    * socio-economic status for interaction analyses
        * The reserved codes -8 and -4 of F1SES1QU are set to missing before the binary versions are derived.
        * Without this step they would pass through both recodes unchanged and turn up as extra
        * categories (-8, -4) in ses_bin and low_ses.
        clonevar ses_q = F1SES1QU 
        recode ses_q (-8 -4 = .)
        recode ses_q (1 = 0 "lowest quartile") (2 3 4 = 1 "second quartile or above"), gen(ses_bin)
        * low_ses is ses_bin reversed: 1 = lowest quartile
        recode ses_bin (0 = 1 "lowest quartile") (1 = 0 "second quartile or above"), gen(low_ses)

    * Binary race variable for interaction analysis (built from the four-category m_race of section 1.7):
        recode m_race (1 2 = 1 "Hispanic, Black, Other") (3 4 = 0 "White (non-hispanic) or Asian"), gen(non_white_asi)


*** 1.14 Loading weights from genetic and propensity score matching performed in R using WeightIt and MatchIt packages 
    * Attach the pre-computed weights to every row of a student (m:1 on STU_ID).
    * Rows without weights are kept; rows that exist only in the weights file are dropped.
    merge m:1 STU_ID using "matching_weights.dta",  ///
        keep(master match)                          ///
        keepusing(weights_all_pscore weights_notvc weights_tvc)

    * Why the weights are loaded from "matching_weights.dta" instead of being computed here:
    *   - The genetic matching is computationally heavy and time-consuming.
    *   - The replication code that produces the weights is the R-script "matching.R" in this replication package.
    *   - Running matching.R produces three files with weights: "pscore_weights.dta", "genetic_weights_notvc.dta"
    *     and "genetic_weights_tvc.dta". After running it, the merge statement above would need to be adjusted
    *     and repeated for each of these files.

*** 1.15 The key treatment indicator (treatment_level_at_t2) based on CollegeGoing variable:
    * CollegeGoing is only renamed here; its values are not changed.
    * This variable is not varying over time. 
    rename CollegeGoing treatment_level_at_t2

*** 1.16 Setting panel data structure with XT-framework and saving
    * Move the identifier, period, outcome and treatment variables to the front of the dataset.
    order STU_ID PostPeriod voted treatment_level_at_t2
    * Declare STU_ID as the panel identifier (no time variable is declared).
    xtset STU_ID

    * Write the ELS analysis file, overwriting an existing copy if there is one.
    save "analysis_data_ELS.dta" , replace

****************************************************************************
**** Section 2: NELS:88                     							****	
****************************************************************************	
    *** The NELS:88 data is initially a wide dataset with one row per panelist. 
    *** This script finally exports the data to a long format with one row per panelist per year (1992 and 1996)

    /* Change delimiter to a semi-colon */
    #delimit;
    clear;
    /* Load selected variables from raw NELS:88 data.
       Every variable is listed exactly once; the order of first appearance is kept.
       Comments inside this block must use the slash-star form, because the
       command delimiter is a semi-colon until "#delimit cr" below. */
    use 
        F2SEX
        F2RACE1
        F2SES1Q
        BY2XMQ
        VOTEPRES
        F4IVPRE
        F4ED1
        F4ED2
        STU_ID
        F1BIRTHY
        F2BIRTHY
        BIRTHYR
        F4ED3
        F4ED4
        F4ED5
        F4ED6
        F4EDGR2 
        F4EDGR3 
        BIRTHMO
        F1BIRTHM
        F2BIRTHM
        F4EDGR4 
        F4EDGR5 
        F4EDGR6
        PSELASTY
        F4HHDG
        F4EDGR1
        TALKPARN
        BYPARED
        NUMINST
        F1PARED
        F4ATTPSE
        F4JRDVA
        F4JRDVB
        F3NUMINT
        F3PSENUM
        F3ATTEND
        F3PSEATN

        ENRL0692
        ENRL0792
        ENRL0892
        ENRL0992
        ENRL1092
        ENRL1192
        ENRL1292
        ENRL0193
        ENRL0293
        ENRL0393
        F4EFSECT
        ENRL0493
        ENRL0593
        ENRL0693
        ENRL0793
        F4ELSECT
        ENRL0893
        ENRL0993
        F4ELMY
        ENRL1093
        ENRL1193
        F4EFMY
        ENRL1293
        ENRL0194
        ENRL0294
        ENRL0394
        RACEGREW
        RACEPRES
        ENRL0494
        ENRL0594
        ENRL0694
        ENRL0794
        ENRL0894
    using "NELS_88_00_BYF4STU_V1_0.dta", clear;

    #delimit cr
 	
*** 1.1 Voting variables:
    * voted92 and voted96 are labelled 0/1 copies of VOTEPRES and F4IVPRE; all codes from -9 to -1 become missing.
    * Note the different source codings: "No" is 2 in VOTEPRES but 0 in F4IVPRE.
    recode VOTEPRES (2 = 0 "No") (1 = 1 "Yes")          ///
                    (-9 -8 -7 -6 -5 -4 -3 -2 -1 = .), gen(voted92)
    recode F4IVPRE  (0 = 0 "No") (1 = 1 "Yes")          ///
                    (-9 -8 -7 -6 -5 -4 -3 -2 -1 = .), gen(voted96)

*** 1.2 Socio-economic status:
    * Codes 8 and 9 are set to missing in place
    recode F2SES1Q (8 9 = .)

*** 1.3 Cognitive ability:
    * rr_dyn_cog is BY2XMQ with codes 6, 8 and 9 set to missing
    recode BY2XMQ (6 8 9 = .), gen(rr_dyn_cog)

*** 1.4 Self-reported race
    * m_race regroups F2RACE1 into four categories; code 8 becomes missing
    recode F2RACE1 (2 = 1 "Hispanic")               ///
                   (3 5 = 2 "Other")                ///
                   (1 = 3 "Asian")                  ///
                   (4 = 4 "White, non hispanic")    ///
                   (8 = .), gen(m_race)

*** 1.5 CollegeGoing: Measuring college enrollment between 1992 and 1996 

    * The dataset gives access to two sources for this information, which lets us triangulate out missing values:
    *   (1) survey responses from the F3 survey in 1994
    *   (2) survey responses as well as official post-secondary transcript data from F4 in 2000
    *
    * From these we construct three indicators:
    *   (a) at least 1 month of PS enrollment between November 1992 and August 1994 (F3 survey method)
    *   (b) attenders of college in any of the years 1993, 1994, 1995 and 1996 (F4 transcript + survey method 1)
    *   (c) bachelor attainment before the second election (F4 survey method 2)
    *
    * Together they give a measure of whether or not the student attended college between the elections in 1992 and 1996.

    *** 1.5.1 First enrollment indicator (a) (F3 survey, 1994)
        * Indicator (a) is called "ever_enrolled_between_9211_9408".
        * It starts from NUMINST (0 is coded "No", 1 to 5 "Yes", -9 missing) and is then switched to "Yes"
        * for everybody with at least one enrolled month in the monthly ENRL variables
        * (ENRL1192 to ENRL0894, i.e. November 1992 to August 1994).

        * Respondents with NUMINST equal to 0 are taken to have no month of enrollment
        codebook NUMINST, tab(20)
        recode NUMINST (-9 = .) (1 2 3 4 5 = 1 "Yes") (0 = 0 "No"), gen(ever_enrolled_between_9211_9408)

        * Monthly enrollment variables covering November 1992 to August 1994
        local enrl_months ENRL1192 ENRL1292                                                     ///
            ENRL0193 ENRL0293 ENRL0393 ENRL0493 ENRL0593 ENRL0693                               ///
            ENRL0793 ENRL0893 ENRL0993 ENRL1093 ENRL1193 ENRL1293                               ///
            ENRL0194 ENRL0294 ENRL0394 ENRL0494 ENRL0594 ENRL0694 ENRL0794 ENRL0894

        * Binary copy of each month (suffix _bin): 1 = enrolled in higher ed, 0 = not, negative codes = missing.
        * Codes 13, 23 and 33 are the ones not considered higher ed (less than three-year institution).
        foreach month_var of local enrl_months {
            recode `month_var'                                                                                        ///
                (4 6 10 11 12 14 15 16 20 21 22 24 25 26 30 31 32 34 35 36= 1 "enrolled in higher ed")  ///
                (13 23 33 = 0 "not enrolled in higher ed")                                                ///
                (-9 -8 -7 -6 -5 -4 -3 -2 -1 = .), gen(`month_var'_bin)
        }

        * Number of enrolled months: rowtotal treats missing as zero, so this count can only be used to code positives.
        * The wildcard picks up every variable whose name ends in "bin".
        egen ps_months9211_9408 = rowtotal(*bin)
        generate one_month = ps_months9211_9408 != 0   // enrolled at least one month

        * Students with at least one enrolled month are coded as enrolled
        replace ever_enrolled_between_9211_9408 = 1 if one_month == 1

        * First indicator (a) is done.

    *** 1.5.2 Second enrollment indicator (b) [F4 method 1 (transcript + survey)]
        * Indicator (b) is called "evr_enrolled_between_1993_1996":
        *   - "attenders of college in any of the years 1993, 1994, 1995 and 1996"
        *   - coded 1 if the respondent started PS enrollment, or last attended, inside the window between the elections
        *   - variables used: F4ATTPSE, F4EFMY, F4EFSECT, F4ELMY, F4ELSECT

        * Starting values: never-attenders as of the fourth follow-up (2000) are a certain 0,
        * ever-attenders start out as 2 = "Maybe". The rule for 478 is there to attach the label "Yes" to value 1.
        recode F4ATTPSE (2 = 0 "No")                            ///
                        (1 = 2 "Maybe (i.e. ever attended)")    ///
                        (-9 = .)                                ///
                        (478 = 1 "Yes"), gen(evr_enrolled_between_1993_1996)

        * F4EFMY has a legitimate skip for people without PS-experience (post-secondary),
        * i.e. it was asked of everybody who is currently coded "maybe" on indicator (b).
        recode F4EFMY F4EFSECT (-7 -3 -2 -1 = .)

        * Students who started college in this period
        replace evr_enrolled_between_1993_1996 = 1 if inrange(F4EFMY,199211,199610) & inlist(F4EFSECT,1,2,4,5,6)
        * People who started in PSE after t2 had no enrollment
        replace evr_enrolled_between_1993_1996 = 0 if inrange(F4EFMY,199611,200008)

        * Respondents who last attended PSE between t1 and t2 can be coded as enrolled.
        * F4ELMY: date most recently attended postsecondary school
        * ("In what month and year did you last attend (school attended most recently)?").
        * Respondents without postsecondary experience are missing on F4ELMY.
        recode F4ELMY F4ELSECT (-7 -3 -2 -1 = .)

        * Students who ended college in this period
        replace evr_enrolled_between_1993_1996 = 1 if inrange(F4ELMY,199212,199611) & inlist(F4ELSECT,1,2,4,5,6)

        * Whoever is still "maybe" had some enrollment, but WHEN it took place cannot be determined: set to missing.
        replace evr_enrolled_between_1993_1996 = . if evr_enrolled_between_1993_1996 == 2

        * This completes second indicator (b)
       
    *** 1.5.3 Combining indicators (a) and (b) into a single variable (CollegeGoing) 
        * Start from indicator (b) ...
        generate CollegeGoing = evr_enrolled_between_1993_1996
        * ... and overwrite with 1 wherever indicator (a) is positive (we know they attended college in the period)
        replace  CollegeGoing = 1 if ever_enrolled_between_9211_9408 == 1

        label define CollegeGoing 0 "No College" 1 "Attended College"
        label values CollegeGoing CollegeGoing

    *** 1.5.4 Third enrollment indicator (c) - bachelor achievement  [F4 method 2 (survey)]
        * Goal: a measure of whether a person had achieved a college bachelor by t2 or not.
        * Indicator (c) is named s_bachelor_between_1993_1996.
        * Inputs: F4ATTPSE, F4HHDG and the six degree slots (type F4EDGR1-F4EDGR6, date F4ED1-F4ED6).
        * Working codes while the indicator is built: 0 = no, 1 = yes, 2 = "maybe" (still undecided).

        * People who by 2000 had never attended PSE should have the value zero (we are positive that they did not have a bachelor)
        recode F4ATTPSE (2 = 0 "No") (1 = 1 "Yes") (-9 = .) , gen(ever_attended_pse_F4)

        * Ever-attenders start out as "maybe". The rule for 4845 is there to attach a label to value 1.
        recode ever_attended_pse_F4 (0 = 0 "Less than bachelor") (1 = 2 "Maybe (ever attended)") (4845 = 1 "recieved bachelor"), gen(s_bachelor_between_1993_1996)
        label variable s_bachelor_between_1993_1996 "received Bachelor degree between 1993 and 1996"

        ** People who by 2000 had certificate/license or lower as highest PSE attainment get the value zero
        ** (we are positive that they did not have a bachelor). "Maybe" rows without F4HHDG information become missing.
        recode F4HHDG (-9 -3 = .)
        replace s_bachelor_between_1993_1996 = 0 if inlist(F4HHDG,1,2) // no degree & certificate/license
        recode s_bachelor_between_1993_1996 (2 = . ) if F4HHDG == .

        ** The remaining "maybe" rows are now recoded to 1, 0 or missing.

            ** CASE A: Has at least one BA certificate registered
            ** (degree type codes 2-6 are the ones this script treats as BA or above):
                * A1: If any such certificate is dated between the two elections (199211-199610), code 1.
                forvalues cert = 1/6 {
                    replace s_bachelor_between_1993_1996 = 1 if inlist(F4EDGR`cert',2,3,4,5,6) & inrange(F4ED`cert',199211,199610)
                }

                * A2: If all BA certificates are completed after t2, code 0.
                * BA_F4ED# holds the date of slot # when the slot is of a BA type, and missing otherwise.
                forvalues cert = 1/6 {
                    recode F4ED`cert' (-7 -3 -2 -1 = .), gen(BA_F4ED`cert')
                    replace BA_F4ED`cert' = . if inlist(F4EDGR`cert',2,3,4,5,6)!= 1
                }

                * Earliest and latest date among the BA certificates
                egen min_BA_date = rowmin(BA_F4ED1 BA_F4ED2 BA_F4ED3 BA_F4ED4 BA_F4ED5 BA_F4ED6)
                egen max_BA_date = rowmax(BA_F4ED1 BA_F4ED2 BA_F4ED3 BA_F4ED4 BA_F4ED5 BA_F4ED6)
                * If even the earliest date is after t2, code 0.
                replace s_bachelor_between_1993_1996 = 0  if inrange(min_BA_date, 199611,202008)

            *  CASE B: Does not have any dated BA certificate:
                * We do not know whether a BA was completed before or after t2, so whoever is still "maybe" becomes missing.
                * The codebook call only displays the dates of these rows for inspection.
                codebook min_BA_date max_BA_date if s_bachelor_between_1993_1996 == 2, tab(20)
                recode s_bachelor_between_1993_1996 (2 = .)

            ** CASE C: If they received a master or higher in the year just after t2 (199611-199710), it is extremely
            ** unlikely that they had not received a bachelor by t2, so we code them to 1.
            ** All six degree slots are checked, including slot 4.
            forvalues cert = 1/6 {
                replace s_bachelor_between_1993_1996 = 1 if inlist(F4EDGR`cert',4,5,6) & inrange(F4ED`cert',199611,199710)
            }

     *** 1.5.5 Merging third indicator (c) with CollegeGoing (which combined indicators (a) and (b))
        * A positive bachelor indicator always counts as college attendance.
        replace CollegeGoing = 1 if s_bachelor_between_1993_1996 == 1
        * Attendance values of 1 are set to missing whenever the bachelor indicator is missing.
        replace CollegeGoing = . if CollegeGoing == 1 & missing(s_bachelor_between_1993_1996)

    * Variable (CollegeGoing) measuring college enrollment between 1992 and 1996 is done.

*** 1.6 Year of birth / age -- eligible for voting at t1 (1992). We simply make voting missing if they are not eligible.
    *** must be born in the 10th month of 1974 or earlier
    * Birth-year codes 98 and 99 are set to missing
    recode F2BIRTHY (98 99 = .)

    * Eligible: birth year 72, 73 or 74, except birth year 74 combined with birth month 11 or 12.
    * Every other birth year (missing included) is coded 0.
    gen eligible_voter_1992 = inlist(F2BIRTHY,72,73,74) & !(F2BIRTHY == 74 & inlist(F2BIRTHM,11,12))

    * The 1992 vote is blanked for everybody coded as not eligible
    replace voted92 = . if eligible_voter_1992 == 0

*** 1.7 missingness indicators:
    * Tag missing values across the two voting variables and CollegeGoing ...
    missings tag voted92 voted96 CollegeGoing, generate(mis_on_voting_and_cg)
    * ... and collapse the values 1 to 5 into a single 1
    replace mis_on_voting_and_cg = 1 if inlist(mis_on_voting_and_cg,1,2,3,4,5)

*** 1.8 Reshape from wide to long data
    * The stub "voted" turns voted92 and voted96 into one variable "voted", with the suffix (92 or 96) stored in "year".
    sort STU_ID
    reshape long voted, i(STU_ID) j(year)
    label variable voted "Whether respondent voted"
        
    * PostPeriod: 0 for the 1992 row, 1 for the 1996 row. "year" is dropped once PostPeriod exists.
    recode year (92 = 0) (96 = 1), gen(PostPeriod)
    label variable PostPeriod "1 if 1996"
    label define PostPeriod 0 "1992" 1 "1996"
    label values PostPeriod PostPeriod
    drop year

*** 1.9 The key treatment indicator (treatment_level_at_t2) based on CollegeGoing variable:
    * The variable is only renamed here. It does not vary over time.
    rename CollegeGoing treatment_level_at_t2

*** 1.10 Future College Attendance for restricted control group
    * Variables used: F4ATTPSE F4EFMY
    * Start from a copy of the treatment indicator ...
    clonevar restr_treatment_level_at_t2 = treatment_level_at_t2
    * ... set the untreated with F4ATTPSE equal to 2 to missing ...
    replace restr_treatment_level_at_t2 = . if restr_treatment_level_at_t2 == 0 & F4ATTPSE == 2
    * ... and code everybody whose F4EFMY date lies after t2 (199611-200008) as 0
    replace restr_treatment_level_at_t2 = 0 if inrange(F4EFMY,199611,200008)

*** 1.11 Time-varying variables:
    * Residential mobility
    *   We proxy this by the change in diversity between the neighborhood the student grew up in
    *   and the neighborhood the student currently lives in.
    *   - F3 "RACEGREW": PERCENT SAME RACE NEIGHBORHOOD GREW UP
    *   - F3 "RACEPRES": PERCENT SAME RACE CURRENT NEIGHBORHOOD

        * Reserve codes to missing
        recode RACEGREW RACEPRES (-9 -8 -7 -6 = .)
        recode F4JRDVA F4JRDVB (-7 -3 -2 -1 = .)

        * Folded "distance" in terms of change in diversity
        gen abs_div_change = abs(RACEPRES - RACEGREW)

        * Quintile groups of the absolute change, renumbered: 1 becomes 0, 3 becomes 1, 4 becomes 2, 5 becomes 3.
        * NOTE(review): group 2 is not recoded, presumably because ties at the bottom leave it empty.
        * If it is not empty, it shares the value 2 with the recoded group 4. Confirm with a tabulation.
        xtile div_change_q_at_t2 = abs_div_change , nq(5)
        recode div_change_q_at_t2 (1 = 0) (3 = 1) (4 = 2) (5 = 3)

        * Time-varying version: the renumbered group multiplied by PostPeriod
        gen xx_div_change_q = div_change_q_at_t2*PostPeriod

*** 1.12 Other Time-invariant variables:
    * Voting in 1992: labelled 0/1 copy of VOTEPRES with codes -9, -8 and -6 set to missing
    recode VOTEPRES (-9 -8 -6 = .)  ///
                    (1 = 1 "Yes")   ///
                    (2 = 0 "No"), gen(voted_in_PRE_PERIOD)

    * Some variables for matching: SES, gender and 1992 voting
    clonevar m_ses    = F2SES1Q
    clonevar m_voteW1 = voted_in_PRE_PERIOD
    * NOTE(review): confirm in the NELS:88 codebook which of the F2SEX codes 1 and 2 is male,
    * so that the value labels below are known to be attached to the right group.
    recode F2SEX (1=0 "female") (2=1 "male"), gen(m_sex)

    * Socio-economic status for the interaction analyses: 1 = bottom quartile, 0 = quartiles 2 to 4
    recode m_ses (1  = 1 "Bottom Quartile Parental SES")    ///
                 (2 3 4 = 0 "Quartile 2-4 Parental SES"), gen(low_ses)

    * Binary race variable for the interaction analyses: m_race 1-2 versus 3-4
    recode m_race (1 2 = 1 "Black, Hispanic or other")      ///
                  (3 4 = 0 "White (non hispanic) or asian"), gen(non_white_race)

*** 1.13 Loading weights from genetic matching performed in R 
    * Attach every variable whose name starts with "weights" to all rows of a student (m:1 on STU_ID).
    * Rows without weights are kept; rows that exist only in the weights file are dropped.
    merge m:1 STU_ID using "NELS_matching_weights.dta", keep(master match) keepusing(weights*) 

*** 1.14 Saving data in long format using xt-framework
    * Move the identifier, period, outcome and treatment variables to the front of the dataset.
    order STU_ID PostPeriod voted  treatment_level_at_t2 
    * Declare STU_ID as the panel identifier (no time variable is declared).
    xtset STU_ID

    * Overwrite an existing copy, as is done for the ELS analysis file. Without the replace option
    * a second run of this script stops here with a "file already exists" error.
    save "analysis_data_NELS.dta", replace
